This file contains an example of tuning a Logistic Regression model with BayesSearchCV.
import pickle
import time
import helpsk as hlp
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
import plotly.io as pio
pio.renderers.default='notebook'
# Load the pre-split training features and labels.
# NOTE(review): pickle is only safe on trusted files — these are assumed to
# have been produced by an earlier notebook in this project.
with open('../X_train.pkl', 'rb') as handle:
    X_train = pickle.load(handle)
with open('../y_train.pkl', 'rb') as handle:
    y_train = pickle.load(handle)
hlp.pandas.numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | # of Zeros | % Zeros | Mean | St Dev. | Coef of Var | Skewness | Kurtosis | Min | 10% | 25% | 50% | 75% | 90% | Max | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| duration | 760 | 40 | 5.0% | 0 | 0.0% | 21.0 | 11.7 | 0.6 | 1.0 | 0.6 | 4.0 | 9.0 | 12.0 | 18.0 | 24.0 | 36.0 | 60.0 |
| credit_amount | 800 | 0 | 0.0% | 38 | 5.0% | 3,203.9 | 2,932.3 | 0.9 | 1.9 | 3.9 | 0.0 | 753.9 | 1,300.8 | 2,236.5 | 3,951.5 | 7,394.6 | 18,424.0 |
| installment_commitment | 800 | 0 | 0.0% | 0 | 0.0% | 3.0 | 1.1 | 0.4 | -0.5 | -1.2 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| residence_since | 800 | 0 | 0.0% | 0 | 0.0% | 2.9 | 1.1 | 0.4 | -0.3 | -1.4 | 1.0 | 1.0 | 2.0 | 3.0 | 4.0 | 4.0 | 4.0 |
| age | 800 | 0 | 0.0% | 0 | 0.0% | 35.6 | 11.4 | 0.3 | 1.0 | 0.7 | 19.0 | 23.0 | 27.0 | 33.0 | 42.0 | 52.0 | 75.0 |
| existing_credits | 800 | 0 | 0.0% | 0 | 0.0% | 1.4 | 0.6 | 0.4 | 1.3 | 1.6 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 | 4.0 |
| num_dependents | 800 | 0 | 0.0% | 0 | 0.0% | 1.1 | 0.3 | 0.3 | 2.0 | 2.1 | 1.0 | 1.0 | 1.0 | 1.0 | 1.0 | 2.0 | 2.0 |
hlp.pandas.non_numeric_summary(X_train)
| # of Non-Nulls | # of Nulls | % Nulls | Most Freq. Value | # of Unique | % Unique | |
|---|---|---|---|---|---|---|
| checking_status | 763 | 37 | 4.6% | no checking | 4 | 0.5% |
| credit_history | 800 | 0 | 0.0% | existing paid | 5 | 0.6% |
| purpose | 800 | 0 | 0.0% | radio/tv | 10 | 1.2% |
| savings_status | 800 | 0 | 0.0% | <100 | 5 | 0.6% |
| employment | 800 | 0 | 0.0% | 1<=X<4 | 5 | 0.6% |
| personal_status | 800 | 0 | 0.0% | male single | 4 | 0.5% |
| other_parties | 800 | 0 | 0.0% | none | 3 | 0.4% |
| property_magnitude | 800 | 0 | 0.0% | car | 4 | 0.5% |
| other_payment_plans | 800 | 0 | 0.0% | none | 3 | 0.4% |
| housing | 800 | 0 | 0.0% | own | 3 | 0.4% |
| job | 800 | 0 | 0.0% | skilled | 4 | 0.5% |
| own_telephone | 800 | 0 | 0.0% | none | 2 | 0.2% |
| foreign_worker | 800 | 0 | 0.0% | yes | 2 | 0.2% |
y_train[0:10]
array([1, 1, 0, 1, 0, 1, 0, 1, 1, 0])
np.unique(y_train, return_counts=True)
(array([0, 1]), array([559, 241]))
# Class balance of the target. Compute the counts once instead of calling
# np.unique (an O(n log n) pass) twice on the same data.
_, class_counts = np.unique(y_train, return_counts=True)
class_counts / class_counts.sum()
array([0.69875, 0.30125])
# Define the hyper-parameter search space: a single model family
# (LogisticRegression) evaluated over 50 Bayesian-optimization iterations.
# The helper also builds the preprocessing pipeline (imputer / scaler /
# encoder) whose choices are themselves part of the search space.
search_space = hlp.sklearn_search.ClassifierSearchSpace(
    data=X_train,
    models=[hlp.sklearn_search.ClassifierSearchSpaceModels.LogisticRegression],
    iterations=[50],
    random_state=42,
)
# pip install scikit-optimize
from skopt import BayesSearchCV
#from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import RepeatedKFold
# Bayesian hyper-parameter search over the pipeline defined above.
# 5-fold CV repeated twice = 10 fits per candidate, scored on AUC.
bayes_search = BayesSearchCV(
    estimator=search_space.pipeline(),
    search_spaces=search_space.search_spaces(),
    cv=RepeatedKFold(n_splits=5, n_repeats=2),
    scoring='roc_auc',
    n_jobs=-1,  # use all available cores
    verbose=1,
    random_state=42,
)
# Run the search and report wall-clock time. time.perf_counter() is a
# monotonic clock intended for measuring elapsed intervals; time.time()
# can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
bayes_search.fit(X_train, y_train)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to run BayesSearchCV: {elapsed_time:.3f} seconds; {elapsed_time / 60:.1f} minutes")
Elapsed time to run BayesSearchCV: 56.335 seconds; 0.9 minutes
print(bayes_search.best_score_)
0.7736194546305393
print(bayes_search.best_params_)
OrderedDict([('model', LogisticRegression(C=0.1749996766322668, max_iter=1000, random_state=42)), ('model__C', 0.1749996766322668), ('prep__non_numeric__encoder__transformer', OneHotEncoder(handle_unknown='ignore')), ('prep__numeric__imputer__transformer', SimpleImputer()), ('prep__numeric__scaler__transformer', StandardScaler())])
# Wrap the fitted searcher in an MLExperimentResults object, persist it to
# YAML, then reload from that file — a round-trip that also means the
# results can later be analyzed without re-running the ~1-minute search.
results = hlp.sklearn_eval.MLExperimentResults.from_sklearn_search_cv(
    searcher=bayes_search,
    higher_score_is_better = True,  # roc_auc: higher is better
    parameter_name_mappings = search_space.param_name_mappings()
)
results.to_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
results = hlp.sklearn_eval.MLExperimentResults.from_yaml_file(yaml_file_name = 'Run 1 - Logistic Regression - BayesSearchCV.yaml')
results.best_score
0.7736194546305393
results.best_params
{'model': 'LogisticRegression()',
'C': 0.1749996766322668,
'imputer': 'SimpleImputer()',
'scaler': 'StandardScaler()',
'encoder': 'OneHotEncoder()'}
# Ranked table of all trials: mean roc_auc with a 95% confidence interval,
# plus the hyper-parameter values used in each trial.
results.to_formatted_dataframe(num_rows=100, include_rank=True)
| rank | roc_auc Mean | roc_auc 95CI.LO | roc_auc 95CI.HI | C | imputer | scaler | encoder |
|---|---|---|---|---|---|---|---|
| 1 | 0.774 | 0.745 | 0.802 | 0.175 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 0.772 | 0.746 | 0.797 | 0.175 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 3 | 0.771 | 0.744 | 0.799 | 0.046 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 4 | 0.771 | 0.755 | 0.786 | 0.238 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 5 | 0.771 | 0.750 | 0.791 | 0.022 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 6 | 0.770 | 0.740 | 0.801 | 0.078 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 7 | 0.770 | 0.750 | 0.791 | 0.284 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 8 | 0.767 | 0.745 | 0.789 | 0.109 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 9 | 0.767 | 0.736 | 0.797 | 1.548 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 10 | 0.767 | 0.739 | 0.794 | 0.240 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 11 | 0.766 | 0.752 | 0.780 | 1.596 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 12 | 0.766 | 0.745 | 0.787 | 1.137 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 13 | 0.766 | 0.740 | 0.791 | 0.141 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 14 | 0.766 | 0.743 | 0.788 | 0.213 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 15 | 0.765 | 0.753 | 0.777 | 0.099 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 16 | 0.765 | 0.733 | 0.797 | 1.593 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 17 | 0.764 | 0.738 | 0.789 | 0.234 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 18 | 0.764 | 0.737 | 0.790 | 0.035 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 19 | 0.763 | 0.748 | 0.779 | 22.376 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 20 | 0.762 | 0.739 | 0.785 | 3.170 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 21 | 0.762 | 0.726 | 0.798 | <NA> | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 22 | 0.762 | 0.735 | 0.789 | 0.769 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 23 | 0.762 | 0.732 | 0.792 | 0.454 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 24 | 0.760 | 0.737 | 0.782 | 99.780 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 25 | 0.759 | 0.725 | 0.794 | 99.886 | SimpleImputer(strategy='median') | StandardScaler() | OneHotEncoder() |
| 26 | 0.759 | 0.741 | 0.777 | 0.307 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 27 | 0.758 | 0.730 | 0.787 | 0.158 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 28 | 0.758 | 0.724 | 0.792 | 0.005 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 29 | 0.758 | 0.735 | 0.782 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 30 | 0.757 | 0.742 | 0.773 | 99.531 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 31 | 0.756 | 0.740 | 0.773 | 0.010 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 32 | 0.756 | 0.729 | 0.782 | 0.000 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 33 | 0.754 | 0.718 | 0.790 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 34 | 0.753 | 0.727 | 0.779 | 0.001 | SimpleImputer(strategy='median') | MinMaxScaler() | OneHotEncoder() |
| 35 | 0.753 | 0.724 | 0.782 | 99.886 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 36 | 0.752 | 0.717 | 0.787 | 32.731 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 37 | 0.752 | 0.725 | 0.779 | 0.005 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 38 | 0.752 | 0.725 | 0.779 | 22.913 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | OneHotEncoder() |
| 39 | 0.748 | 0.730 | 0.766 | 0.000 | SimpleImputer() | MinMaxScaler() | OneHotEncoder() |
| 40 | 0.732 | 0.714 | 0.751 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 41 | 0.732 | 0.708 | 0.756 | 0.403 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 42 | 0.732 | 0.703 | 0.760 | 0.003 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 43 | 0.731 | 0.708 | 0.754 | 99.935 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
| 44 | 0.731 | 0.710 | 0.752 | 3.489 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 45 | 0.730 | 0.711 | 0.748 | 99.180 | SimpleImputer(strategy='median') | StandardScaler() | CustomOrdinalEncoder() |
| 46 | 0.728 | 0.695 | 0.762 | 0.108 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 47 | 0.728 | 0.698 | 0.759 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 48 | 0.722 | 0.703 | 0.740 | 0.000 | SimpleImputer() | StandardScaler() | CustomOrdinalEncoder() |
| 49 | 0.720 | 0.696 | 0.743 | 0.000 | SimpleImputer(strategy='most_frequent') | StandardScaler() | CustomOrdinalEncoder() |
| 50 | 0.704 | 0.677 | 0.732 | 0.000 | SimpleImputer(strategy='most_frequent') | MinMaxScaler() | CustomOrdinalEncoder() |
| 51 | 0.700 | 0.658 | 0.742 | 0.000 | SimpleImputer(strategy='median') | MinMaxScaler() | CustomOrdinalEncoder() |
# `trial_rankings` gives the score rank for each trial index.
# e.g. array([4, 2, 1, 3]) means:
#   the 1st trial (i.e. set of params) was the worst (rank 4);
#   the 3rd trial was the best (rank 1).
results.trial_rankings
array([21, 19, 49, 42, 36, 11, 38, 41, 44, 34, 51, 47, 6, 13, 3, 37, 26,
43, 7, 23, 18, 25, 22, 24, 32, 33, 50, 1, 45, 40, 8, 31, 16, 48,
27, 39, 12, 10, 30, 35, 20, 14, 15, 9, 46, 4, 17, 29, 28, 5, 2])
# `best_trial_indexes` gives the trial indexes ordered from best score to
# worst. e.g. if `results.trial_rankings` were array([4, 2, 1, 3]), this
# would return [2, 1, 3, 0]: trial index 2 (the 3rd trial) was the best,
# so it comes first; index 0 (the first trial) was the worst, so it is last.
results.best_trial_indexes
array([27, 50, 14, 45, 49, 12, 18, 30, 43, 37, 5, 36, 13, 41, 42, 32, 46,
20, 1, 40, 0, 22, 19, 23, 21, 16, 34, 48, 47, 38, 31, 24, 25, 9,
39, 4, 15, 6, 35, 29, 7, 3, 17, 8, 28, 44, 11, 33, 2, 26, 10])
# Diagnostic plots of the search: score across trials (optionally sized /
# colored by hyper-parameter values), parameter values across trials, a
# scatter matrix, parallel coordinates, and score vs. each parameter.
results.plot_performance_across_trials().show()
results.plot_performance_across_trials(size=None, color='C').show()
results.plot_performance_across_trials(size='C', color='scaler').show()
results.plot_parameter_values_across_trials().show()
results.plot_scatter_matrix(height=800, width=800 * hlp.plot.GOLDEN_RATIO).show()
results.plot_performance_numeric_params()
results.plot_parallel_coordinates().show()
results.plot_performance_non_numeric_params()
# Score vs. regularization strength C, colored by the scaler choice …
results.plot_score_vs_parameter(
    parameter='C',
    color='scaler'
)
# … and colored by the encoder choice.
results.plot_score_vs_parameter(
    parameter='C',
    color='encoder'
)
# Build a dataframe containing only the mean cross-validation score and the
# tuned hyper-parameters, for the regression analysis below.
# (The notebook export had fused the markdown header "roc_auc Mean" onto the
# first code line; that stray text is removed here.)
score_variable = results.primary_score_name + ' Mean'
score_dataframe = results.to_dataframe()
score_dataframe = score_dataframe.drop(columns=[x for x in score_dataframe.columns
                                                if x not in [score_variable] + results.parameter_names])
score_dataframe.head()
| roc_auc Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 27 | 0.773619 | 0.175000 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 50 | 0.771631 | 0.174652 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 14 | 0.771423 | 0.046366 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 45 | 0.770928 | 0.237651 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 49 | 0.770668 | 0.022242 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
def _formula_safe(name):
    """Return `name` with spaces as underscores and any character that is
    neither alphanumeric nor an underscore removed, so it can appear in a
    statsmodels formula."""
    underscored = name.replace(' ', '_')
    return ''.join(ch for ch in underscored if ch == '_' or ch.isalnum())

# Map each original column name to its formula-safe equivalent.
cleaned_column_names = {col: _formula_safe(col) for col in score_dataframe.columns.tolist()}
cleaned_column_names
{'roc_auc Mean': 'roc_auc_Mean',
'C': 'C',
'imputer': 'imputer',
'scaler': 'scaler',
'encoder': 'encoder'}
# Rename the columns so the statsmodels formula below can reference them.
score_dataframe = score_dataframe.rename(columns=cleaned_column_names)
import statsmodels.formula.api as smf
# Regress the mean CV score on the hyper-parameters to estimate how much
# each one drives performance.
y_column = 'roc_auc_Mean'
X_columns = score_dataframe.columns.tolist()
X_columns.remove(y_column)
X_columns = hlp.string.collapse(X_columns, separate=" + ", surround="")
formula = f"{y_column} ~ {X_columns}"
print(formula)
model = smf.ols(formula=formula, data = score_dataframe)
# NOTE(review): this rebinds `results`, shadowing the MLExperimentResults
# object created earlier — the plotting cells above cannot be re-run after
# this point without recreating that object.
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.735
Model: OLS Adj. R-squared: 0.705
Method: Least Squares F-statistic: 24.37
Date: Sat, 12 Feb 2022 Prob (F-statistic): 1.14e-11
Time: 17:24:33 Log-Likelihood: 163.25
No. Observations: 50 AIC: -314.5
Df Residuals: 44 BIC: -303.0
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept 0.7232 0.005 143.176 0.000 0.713 0.733
imputer[T.SimpleImputer(strategy='median')] -0.0011 0.005 -0.235 0.815 -0.011 0.008
imputer[T.SimpleImputer(strategy='most_frequent')] -0.0060 0.003 -1.877 0.067 -0.012 0.000
scaler[T.StandardScaler()] 0.0035 0.003 1.210 0.233 -0.002 0.009
encoder[T.OneHotEncoder()] 0.0377 0.004 9.114 0.000 0.029 0.046
C 2.638e-06 4.54e-05 0.058 0.954 -8.88e-05 9.4e-05
==============================================================================
Omnibus: 17.523 Durbin-Watson: 1.125
Prob(Omnibus): 0.000 Jarque-Bera (JB): 21.259
Skew: -1.334 Prob(JB): 2.42e-05
Kurtosis: 4.757 Cond. No. 186.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
# Standardize the numeric columns (score and C) so the regression
# coefficients are on comparable scales; pass categorical columns through.
# (A stray unused `scaler = StandardScaler()` and a dead commented-out call
# were removed — the pipeline below creates its own scaler.)
numeric_columns = hlp.pandas.get_numeric_columns(score_dataframe)
non_numeric_columns = hlp.pandas.get_non_numeric_columns(score_dataframe)
print(numeric_columns)
print(non_numeric_columns)
numeric_pipeline = Pipeline([
    ('scaling', StandardScaler()),
])
transformations_pipeline = ColumnTransformer([
    ('numeric_pipeline', numeric_pipeline, numeric_columns),
    ('non_numeric_pipeline', 'passthrough', non_numeric_columns)
])
# ColumnTransformer returns a numpy array with columns reordered: the
# numeric columns first, then the passthrough columns — hence the column
# list below.
score_dataframe_transformed = transformations_pipeline.fit_transform(score_dataframe)
score_dataframe_transformed = pd.DataFrame(score_dataframe_transformed,
                                           columns=numeric_columns + non_numeric_columns)
score_dataframe_transformed.head()
['roc_auc_Mean', 'C'] ['imputer', 'scaler', 'encoder']
| roc_auc_Mean | C | imputer | scaler | encoder | |
|---|---|---|---|---|---|
| 0 | 1.151779 | -0.423727 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 1 | 1.040178 | -0.423738 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 2 | 1.028505 | -0.42771 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
| 3 | 1.000682 | -0.421788 | SimpleImputer() | StandardScaler() | OneHotEncoder() |
| 4 | 0.986078 | -0.428457 | SimpleImputer(strategy='most_frequent') | StandardScaler() | OneHotEncoder() |
# The transformed frame came back as object dtype (mixed numeric/string
# columns in one numpy array); cast the numeric columns back to float so
# statsmodels treats them as continuous.
score_dataframe_transformed['roc_auc_Mean'] = score_dataframe_transformed['roc_auc_Mean'].astype('float')
score_dataframe_transformed['C'] = score_dataframe_transformed['C'].astype('float')
print(formula)
# Re-fit the same regression on the standardized data; only the coefficient
# scale differs from the previous fit.
model = smf.ols(formula=formula,
                data = score_dataframe_transformed)
results = model.fit()
print(results.summary())
roc_auc_Mean ~ C + imputer + scaler + encoder
OLS Regression Results
==============================================================================
Dep. Variable: roc_auc_Mean R-squared: 0.735
Model: OLS Adj. R-squared: 0.705
Method: Least Squares F-statistic: 24.37
Date: Sat, 12 Feb 2022 Prob (F-statistic): 1.14e-11
Time: 17:24:35 Log-Likelihood: -38.138
No. Observations: 50 AIC: 88.28
Df Residuals: 44 BIC: 99.75
Df Model: 5
Covariance Type: nonrobust
======================================================================================================================
coef std err t P>|t| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------
Intercept -1.6772 0.286 -5.871 0.000 -2.253 -1.101
imputer[T.SimpleImputer(strategy='median')] -0.0621 0.264 -0.235 0.815 -0.594 0.470
imputer[T.SimpleImputer(strategy='most_frequent')] -0.3372 0.180 -1.877 0.067 -0.699 0.025
scaler[T.StandardScaler()] 0.1961 0.162 1.210 0.233 -0.130 0.523
encoder[T.OneHotEncoder()] 2.1138 0.232 9.114 0.000 1.646 2.581
C 0.0048 0.082 0.058 0.954 -0.161 0.170
==============================================================================
Omnibus: 17.523 Durbin-Watson: 1.125
Prob(Omnibus): 0.000 Jarque-Bera (JB): 21.259
Skew: -1.334 Prob(JB): 2.42e-05
Kurtosis: 4.757 Cond. No. 7.94
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Collect the fitted coefficients with their p-values, drop the intercept,
# and flag which effects are statistically significant at the 5% level.
coefficients = pd.DataFrame({
    'feature': results.params.index,
    'coefficient': results.params,
    'p_value': results.pvalues,
})
is_intercept = coefficients['feature'] == 'Intercept'
coefficients = coefficients[~is_intercept]
coefficients['Stat Sig'] = coefficients['p_value'] <= 0.05
coefficients
| feature | coefficient | p_value | Stat Sig | |
|---|---|---|---|---|
| imputer[T.SimpleImputer(strategy='median')] | imputer[T.SimpleImputer(strategy='median')] | -0.062118 | 8.150377e-01 | False |
| imputer[T.SimpleImputer(strategy='most_frequent')] | imputer[T.SimpleImputer(strategy='most_frequen... | -0.337201 | 6.720605e-02 | False |
| scaler[T.StandardScaler()] | scaler[T.StandardScaler()] | 0.196125 | 2.326576e-01 | False |
| encoder[T.OneHotEncoder()] | encoder[T.OneHotEncoder()] | 2.113756 | 1.079491e-11 | True |
| C | C | 0.004783 | 9.538768e-01 | False |
score_variable
'roc_auc Mean'
# Bar chart of the regression coefficients, ordered by absolute magnitude
# and colored by statistical significance.
px.bar(
    data_frame=coefficients.reindex(coefficients['coefficient'].abs().sort_values(ascending=True).index),
    y='feature',
    x='coefficient',
    color='Stat Sig',
    title=f"Regression Coefficients of Hyper-parameters against '{score_variable}'",
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
from sklearn.inspection import permutation_importance
# Permutation importance of each raw feature, computed against the full
# best pipeline (preprocessing included) on the training data.
# (Removed the unused `forest = bayes_search.best_estimator_['model']`
# assignment — a leftover from the sklearn example this was adapted from.)
start_time = time.perf_counter()  # monotonic clock for elapsed time
result = permutation_importance(
    bayes_search.best_estimator_, X_train, y_train, n_repeats=10, random_state=42, n_jobs=2
)
elapsed_time = time.perf_counter() - start_time
print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")
feature_names = X_train.columns.to_list()
forest_importances = pd.Series(result.importances_mean, index=feature_names)
forest_importances = forest_importances.sort_values(ascending=False)
Elapsed time to compute the importances: 3.138 seconds
import matplotlib.pyplot as plt
# Bar chart of the permutation importances with std-dev error bars.
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=result.importances_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
ax.set_ylabel("Mean accuracy decrease")
fig.set_size_inches(9, 6)
fig.tight_layout()
plt.show()
# Default rate by foreign_worker status. Work on a copy so X_train itself
# is not mutated. Use the string 'mean' rather than np.mean — passing numpy
# functions to DataFrame.agg is deprecated in recent pandas versions.
temp = X_train.copy()
temp['default'] = y_train
temp.groupby('foreign_worker').agg({'default': 'mean'})
| default | |
|---|---|
| foreign_worker | |
| yes | 0.308290 |
| no | 0.107143 |
# Box plot of `age` split by default status (0/1).
fig = px.box(
    data_frame=temp,
    y='age',
    x='default',
    # size=size_variable,
    # color=color_variable,
    # trendline='lowess',
    # labels={
    #     score_variable: f"Average Cross Validation Score ({parser.primary_score_name})",
    # },
    # title=f"<b>{x_variable}</b> - Performance<br>" \
    #       f"<sup>Size of point corresponds to '{size_variable}'</sup>",
    # custom_data=['labels'],
    height=600,
    width=600*hlp.plot.GOLDEN_RATIO
)
fig.show()
NOTE: foreign worker seems like it should be important but is ranked last in feature importance.